Load the required packages:
library(tidyverse)
library(glue)
library(scales)
library(Matrix)
library(SingleCellExperiment)
library(pheatmap)
library(viridis)Global parameters:
result_dir <- file.path("..", "simulated_ovarian_analysis", "report_data")
sample_ids <- c("SK-OV-3", "IGROV-1", "OVMANA", "OVKATE",
"OVTOKO", "COV362")
min_tpm <- 1
min_cor_spearman <- 0.5
max_cor_spearman <- 1.0
min_mean_rel_diff <- 0.3
max_mean_rel_diff <- 1.0
heatmap_palette <- cividis
heatmap_palette_direction <- 1
scatterplot_df_list <- list()Helper functions:
fill_missing_matrix <- function(x, all_rownames) {
missing_rownames <- setdiff(all_rownames, rownames(x))
missing_matrix <- matrix(
0, nrow = length(missing_rownames), ncol = ncol(x)
)
rownames(missing_matrix) <- missing_rownames
full_matrix <- rbind(x, missing_matrix)
full_matrix <- full_matrix[all_rownames,]
return(full_matrix)
}
get_mean_rel_diff <- function(x, y) {
selector <- (x != 0) & (y != 0)
x <- x[selector]
y <- y[selector]
mean(abs(x - y) / ((x + y) / 2))
}
get_mean_rel_diff_matrix <- function(x, y) {
apply(y, 2, function(y_values) {
apply(x, 2, function(x_values) {
selector <- (y_values != 0) & (x_values != 0)
y_values <- y_values[selector]
x_values <- x_values[selector]
mean(abs(x_values - y_values) / ((x_values + y_values) / 2))
})
})
}
max_no_diag <- function(x, idx) {
x[idx] <- NA
max(x, na.rm = TRUE)
}
min_no_diag <- function(x, idx) {
x[idx] <- NA
min(x, na.rm = TRUE)
}
get_cor_diff <- function(cor_matrix) {
output <- sapply(seq(nrow(cor_matrix)), function(idx) {
x <- cor_matrix[idx,]
x[seq(nrow(cor_matrix))] - max_no_diag(x, idx)
})
colnames(output) <- rownames(cor_matrix)
return(output)
}
get_rel_diff_diff <- function(rel_diff_matrix) {
output <- sapply(seq(nrow(rel_diff_matrix)), function(idx) {
x <- rel_diff_matrix[idx,]
min_no_diag(x, idx) - x[seq(nrow(rel_diff_matrix))]
})
colnames(output) <- rownames(rel_diff_matrix)
return(output)
}
prepare_scatterplot_data <- function(tpm_matrix_bulk,
tpm_matrix_pseudobulk) {
tpm_bulk_df <- as.data.frame(tpm_matrix_bulk)
bulk_levels <- colnames(tpm_bulk_df)
tpm_bulk_df$transcript_id <- rownames(tpm_bulk_df)
rownames(tpm_bulk_df) <- NULL
tpm_bulk_df <- tpm_bulk_df %>%
gather(-transcript_id, key = "sample_bulk", value = "tpm_bulk")
tpm_bulk_df$sample_bulk <- factor(
tpm_bulk_df$sample_bulk, levels = bulk_levels
)
tpm_pseudobulk_df <- as.data.frame(tpm_matrix_pseudobulk)
pseudobulk_levels <- colnames(tpm_pseudobulk_df)
tpm_pseudobulk_df$transcript_id <- rownames(tpm_pseudobulk_df)
rownames(tpm_pseudobulk_df) <- NULL
tpm_pseudobulk_df <- tpm_pseudobulk_df %>%
gather(-transcript_id, key = "sample_pseudobulk",
value = "tpm_pseudobulk")
tpm_pseudobulk_df$sample_pseudobulk <- factor(
tpm_pseudobulk_df$sample_pseudobulk, levels = pseudobulk_levels
)
tpm_df <- left_join(tpm_bulk_df, tpm_pseudobulk_df)
tpm_df$status <- ifelse(
as.numeric(tpm_df$sample_bulk) ==
as.numeric(tpm_df$sample_pseudobulk),
"Correct vs Correct", "Correct vs Decoy"
)
tpm_df$status <- factor(
tpm_df$status,
levels = c("Correct vs Correct", "Correct vs Decoy")
)
return(tpm_df)
}
create_scatterplot_paired <- function(plot_df) {
plot_df$tpm_bulk[plot_df$tpm_bulk < 0.1] <- 0.1
plot_df$tpm_pseudobulk[plot_df$tpm_pseudobulk < 0.1] <- 0.1
scatter_plot <- ggplot(plot_df, mapping = aes(x = tpm_bulk,
y = tpm_pseudobulk)) +
geom_point(alpha = 0.35, size = 0.05) +
geom_abline(intercept = 0, slope = 1,
lty = 3, color = "red") +
facet_grid(rows = vars(sample_pseudobulk),
cols = vars(sample_bulk)) +
scale_x_log10(labels = trans_format("log10", math_format(10^.x))) +
scale_y_log10(labels = trans_format("log10", math_format(10^.x))) +
coord_cartesian(xlim = c(0.1, 10^5),
ylim = c(0.1, 10^5),
clip = "off") +
annotation_logticks(outside = TRUE,
size = 0.25,
short = unit(0.05, "cm"),
mid = unit(0.1, "cm"),
long = unit(0.15, "cm")) +
theme_bw() +
theme(aspect.ratio = 1,
axis.title.x = element_blank(),
axis.title.y = element_blank())
return(scatter_plot)
}
create_scatterplot_combined <- function(plot_df) {
plot_df$tpm_bulk[plot_df$tpm_bulk < 0.1] <- 0.1
plot_df$tpm_pseudobulk[plot_df$tpm_pseudobulk < 0.1] <- 0.1
scatter_plot <- ggplot(plot_df, mapping = aes(x = tpm_bulk,
y = tpm_pseudobulk)) +
geom_point(alpha = 0.35, size = 0.05) +
geom_abline(intercept = 0, slope = 1,
lty = 3, color = "red") +
facet_grid(cols = vars(status)) +
scale_x_log10(labels = trans_format("log10", math_format(10^.x))) +
scale_y_log10(labels = trans_format("log10", math_format(10^.x))) +
coord_cartesian(xlim = c(0.1, 10^5),
ylim = c(0.1, 10^5),
clip = "off") +
annotation_logticks(outside = TRUE,
size = 0.25,
short = unit(0.05, "cm"),
mid = unit(0.1, "cm"),
long = unit(0.15, "cm")) +
theme_bw() +
theme(aspect.ratio = 1,
axis.title.x = element_blank(),
axis.title.y = element_blank())
return(scatter_plot)
}Prepare Isosceles pseudobulk RNA-Seq data:
isosceles_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
se_pseudobulk <- readRDS(file.path(
result_dir, glue("isosceles_{sample_id}_Rep1_se_pseudobulk_transcript.rds")
))
})
tpm_pseudobulk <- do.call(
cbind, lapply(isosceles_pseudobulk_list, function(se) {
assay(se, "tpm")
})
)
colnames(tpm_pseudobulk) <- sample_idsPrepare Isosceles bulk RNA-Seq data:
isosceles_bulk_list <- lapply(sample_ids, function(sample_id) {
se_transcript <- readRDS(file.path(
result_dir, glue("isosceles_{sample_id}_Rep2_se_transcript.rds")
))
})
tpm_bulk <- do.call(
cbind, lapply(isosceles_bulk_list, function(se) {
assay(se, "tpm")
})
)
colnames(tpm_bulk) <- sample_idsSelect top transcripts for further analysis:
top_transcripts <- apply(tpm_pseudobulk, 1, mean)
top_transcripts <- names(top_transcripts[top_transcripts >= min_tpm])
length(top_transcripts)## [1] 27044
Re-normalize the TPM values:
tpm_bulk <- tpm_bulk[top_transcripts,]
tpm_pseudobulk <- tpm_pseudobulk[top_transcripts,]
tpm_bulk <- t(
t(tpm_bulk) / colSums(tpm_bulk) * 1e6
)
tpm_pseudobulk <- t(
t(tpm_pseudobulk) / colSums(tpm_pseudobulk) * 1e6
)Calculate the correlation matrices:
cor_spearman <- cor(tpm_pseudobulk[top_transcripts,],
tpm_bulk[top_transcripts,],
method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
tpm_pseudobulk[top_transcripts,],
tpm_bulk[top_transcripts,]
)Spearman correlation matrix:
as.data.frame(cor_spearman)Spearman correlation heatmap:
pheatmap(
cor_spearman,
color = heatmap_palette(100, direction = heatmap_palette_direction),
breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)Mean relative difference matrix:
as.data.frame(mean_rel_diff)Mean relative difference heatmap:
pheatmap(
mean_rel_diff,
color = heatmap_palette(100, direction = -1),
breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)TPM scatter plots:
scatterplot_df <- prepare_scatterplot_data(tpm_bulk,
tpm_pseudobulk)
scatterplot_df$tool <- "Isosceles"
scatterplot_df_list[["Isosceles"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)create_scatterplot_combined(scatterplot_df)Prepare IsoQuant pseudobulk RNA-Seq data:
isoquant_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
isoquant_df <- read_delim(file.path(
result_dir, glue("isoquant_{sample_id}_Rep1_transcript_grouped_counts.tsv")
))
isoquant_transcript_ids <- isoquant_df[, 1, drop = TRUE]
isoquant_counts <- as(as.matrix(isoquant_df[, c(-1)]), "dgCMatrix")
rownames(isoquant_counts) <- isoquant_transcript_ids
isoquant_pseudobulk_counts <- rowSums(isoquant_counts)
isoquant_pseudobulk_tpm <- isoquant_pseudobulk_counts /
sum(isoquant_pseudobulk_counts) * 1e6
return(isoquant_pseudobulk_tpm)
})
isoquant_pseudobulk_tx_ids <- unique(unlist(sapply(
isoquant_pseudobulk_list, names
)))
isoquant_pseudobulk_tpm <- sapply(isoquant_pseudobulk_list,
function(tpm_values) {
tpm_values <- tpm_values[isoquant_pseudobulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(isoquant_pseudobulk_tpm) <- isoquant_pseudobulk_tx_ids
colnames(isoquant_pseudobulk_tpm) <- sample_idsPrepare IsoQuant bulk RNA-Seq data:
isoquant_bulk_list <- lapply(sample_ids, function(sample_id) {
isoquant_df <- read_delim(file.path(
result_dir, glue("isoquant_{sample_id}_Rep2_transcript_tpm.tsv")
))
})
isoquant_bulk_tx_ids <- unique(unlist(sapply(
isoquant_bulk_list, function(df) {df[, 1, drop = TRUE]}
)))
isoquant_bulk_tpm <- sapply(isoquant_bulk_list, function(df) {
tpm_values <- df$TPM
names(tpm_values) <- df[, 1, drop = TRUE]
tpm_values <- tpm_values[isoquant_bulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(isoquant_bulk_tpm) <- isoquant_bulk_tx_ids
colnames(isoquant_bulk_tpm) <- sample_ids
isoquant_bulk_tpm <- fill_missing_matrix(isoquant_bulk_tpm,
rownames(isoquant_pseudobulk_tpm))
isoquant_bulk_tpm <- t(
t(isoquant_bulk_tpm) / colSums(isoquant_bulk_tpm) * 1e6
)Select top transcripts for further analysis:
top_transcripts <- apply(isoquant_pseudobulk_tpm, 1, mean)
top_transcripts <- names(top_transcripts[top_transcripts >= min_tpm])
length(top_transcripts)## [1] 22313
Re-normalize the TPM values:
isoquant_bulk_tpm <- isoquant_bulk_tpm[top_transcripts,]
isoquant_pseudobulk_tpm <- isoquant_pseudobulk_tpm[top_transcripts,]
isoquant_bulk_tpm <- t(
t(isoquant_bulk_tpm) / colSums(isoquant_bulk_tpm) * 1e6
)
isoquant_pseudobulk_tpm <- t(
t(isoquant_pseudobulk_tpm) / colSums(isoquant_pseudobulk_tpm) * 1e6
)Calculate the correlation matrices:
cor_spearman <- cor(isoquant_pseudobulk_tpm[top_transcripts,],
isoquant_bulk_tpm[top_transcripts,],
method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
isoquant_pseudobulk_tpm[top_transcripts,],
isoquant_bulk_tpm[top_transcripts,]
)Spearman correlation matrix:
as.data.frame(cor_spearman)Spearman correlation heatmap:
pheatmap(
cor_spearman,
color = heatmap_palette(100, direction = heatmap_palette_direction),
breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)Mean relative difference matrix:
as.data.frame(mean_rel_diff)Mean relative difference heatmap:
pheatmap(
mean_rel_diff,
color = heatmap_palette(100, direction = -1),
breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)TPM scatter plots:
scatterplot_df <- prepare_scatterplot_data(isoquant_bulk_tpm,
isoquant_pseudobulk_tpm)
scatterplot_df$tool <- "IsoQuant"
scatterplot_df_list[["IsoQuant"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)create_scatterplot_combined(scatterplot_df)Prepare FLAMES pseudobulk RNA-Seq data:
flames_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
flames_df <- read_delim(file.path(
result_dir, glue("flames_{sample_id}_Rep1_transcript_count.csv.gz")
))
flames_df <- flames_df[!grepl("_", flames_df$transcript_id),]
flames_transcript_ids <- flames_df$transcript_id
flames_counts <- as(as.matrix(flames_df[, c(-1, -2)]), "dgCMatrix")
rownames(flames_counts) <- flames_transcript_ids
flames_pseudobulk_counts <- rowSums(flames_counts)
flames_pseudobulk_tpm <- flames_pseudobulk_counts /
sum(flames_pseudobulk_counts) * 1e6
return(flames_pseudobulk_tpm)
})
flames_pseudobulk_tx_ids <- unique(unlist(sapply(
flames_pseudobulk_list, names
)))
flames_pseudobulk_tpm <- sapply(flames_pseudobulk_list,
function(tpm_values) {
tpm_values <- tpm_values[flames_pseudobulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(flames_pseudobulk_tpm) <- flames_pseudobulk_tx_ids
colnames(flames_pseudobulk_tpm) <- sample_idsPrepare FLAMES bulk RNA-Seq data:
flames_bulk_list <- lapply(sample_ids, function(sample_id) {
flames_df <- read_csv(file.path(
result_dir, glue("flames_{sample_id}_Rep2_transcript_count.csv.gz")
))
flames_df <- flames_df[!grepl("_", flames_df$transcript_id),]
return(flames_df)
})
flames_bulk_tx_ids <- unique(unlist(sapply(
flames_bulk_list, function(df) {df$transcript_id}
)))
flames_bulk_tpm <- sapply(flames_bulk_list, function(df) {
tpm_values <- df[, 3, drop = TRUE]
names(tpm_values) <- df$transcript_id
tpm_values <- tpm_values[flames_bulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(flames_bulk_tpm) <- flames_bulk_tx_ids
colnames(flames_bulk_tpm) <- sample_ids
flames_bulk_tpm <- t(t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6)
flames_bulk_tpm <- fill_missing_matrix(flames_bulk_tpm,
rownames(flames_pseudobulk_tpm))
flames_bulk_tpm <- t(
t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6
)Select top transcripts for further analysis:
top_transcripts <- apply(flames_pseudobulk_tpm, 1, mean)
top_transcripts <- names(top_transcripts[top_transcripts >= min_tpm])
length(top_transcripts)## [1] 10387
Re-normalize the TPM values:
flames_bulk_tpm <- flames_bulk_tpm[top_transcripts,]
flames_pseudobulk_tpm <- flames_pseudobulk_tpm[top_transcripts,]
flames_bulk_tpm <- t(
t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6
)
flames_pseudobulk_tpm <- t(
t(flames_pseudobulk_tpm) / colSums(flames_pseudobulk_tpm) * 1e6
)Calculate the correlation matrices:
cor_spearman <- cor(flames_pseudobulk_tpm[top_transcripts,],
flames_bulk_tpm[top_transcripts,],
method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
flames_pseudobulk_tpm[top_transcripts,],
flames_bulk_tpm[top_transcripts,]
)Spearman correlation matrix:
as.data.frame(cor_spearman)Spearman correlation heatmap:
pheatmap(
cor_spearman,
color = heatmap_palette(100, direction = heatmap_palette_direction),
breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)Mean relative difference matrix:
as.data.frame(mean_rel_diff)Mean relative difference heatmap:
pheatmap(
mean_rel_diff,
color = heatmap_palette(100, direction = -1),
breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)TPM scatter plots:
scatterplot_df <- prepare_scatterplot_data(flames_bulk_tpm,
flames_pseudobulk_tpm)
scatterplot_df$tool <- "FLAMES"
scatterplot_df_list[["FLAMES"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)create_scatterplot_combined(scatterplot_df)Prepare Sicelore pseudobulk RNA-Seq data:
sicelore_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
sicelore_df <- read_delim(file.path(
result_dir, glue("sicelore_{sample_id}_Rep1_sicelore_isomatrix.txt")
))
sicelore_df <- sicelore_df[sicelore_df$transcriptId != "undef",]
sicelore_transcript_ids <- sicelore_df$transcriptId
sicelore_counts <- as(as.matrix(sicelore_df[, c(-1, -2, -3)]),
"dgCMatrix")
rownames(sicelore_counts) <- sicelore_transcript_ids
sicelore_pseudobulk_counts <- rowSums(sicelore_counts)
sicelore_pseudobulk_tpm <- sicelore_pseudobulk_counts /
sum(sicelore_pseudobulk_counts) * 1e6
return(sicelore_pseudobulk_tpm)
})
sicelore_pseudobulk_tx_ids <- unique(unlist(sapply(
sicelore_pseudobulk_list, names
)))
sicelore_pseudobulk_tpm <- sapply(sicelore_pseudobulk_list,
function(tpm_values) {
tpm_values <- tpm_values[sicelore_pseudobulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(sicelore_pseudobulk_tpm) <- sicelore_pseudobulk_tx_ids
colnames(sicelore_pseudobulk_tpm) <- sample_idsPrepare Sicelore bulk RNA-Seq data:
sicelore_bulk_list <- lapply(sample_ids, function(sample_id) {
sicelore_df <- read_delim(file.path(
result_dir, glue("sicelore_{sample_id}_Rep2_sicelore_isomatrix.txt")
))
sicelore_df <- sicelore_df[sicelore_df$transcriptId != "undef",]
return(sicelore_df)
})
sicelore_bulk_tx_ids <- unique(unlist(sapply(
sicelore_bulk_list, function(df) {df$transcriptId}
)))
sicelore_bulk_tpm <- sapply(sicelore_bulk_list, function(df) {
tpm_values <- df[, 4, drop = TRUE]
names(tpm_values) <- df$transcriptId
tpm_values <- tpm_values[sicelore_bulk_tx_ids]
tpm_values[is.na(tpm_values)] <- 0
return(tpm_values)
})
rownames(sicelore_bulk_tpm) <- sicelore_bulk_tx_ids
colnames(sicelore_bulk_tpm) <- sample_ids
sicelore_bulk_tpm <- t(
t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)
sicelore_bulk_tpm <- fill_missing_matrix(sicelore_bulk_tpm,
rownames(sicelore_pseudobulk_tpm))
sicelore_bulk_tpm <- t(
t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)Select top transcripts for further analysis:
top_transcripts <- apply(sicelore_pseudobulk_tpm, 1, mean)
top_transcripts <- names(top_transcripts[top_transcripts >= min_tpm])
length(top_transcripts)## [1] 18032
Re-normalize the TPM values:
sicelore_bulk_tpm <- sicelore_bulk_tpm[top_transcripts,]
sicelore_pseudobulk_tpm <- sicelore_pseudobulk_tpm[top_transcripts,]
sicelore_bulk_tpm <- t(
t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)
sicelore_pseudobulk_tpm <- t(
t(sicelore_pseudobulk_tpm) / colSums(sicelore_pseudobulk_tpm) * 1e6
)Calculate the correlation matrices:
cor_spearman <- cor(sicelore_pseudobulk_tpm[top_transcripts,],
sicelore_bulk_tpm[top_transcripts,],
method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
sicelore_pseudobulk_tpm[top_transcripts,],
sicelore_bulk_tpm[top_transcripts,]
)Spearman correlation matrix:
as.data.frame(cor_spearman)Spearman correlation heatmap:
pheatmap(
cor_spearman,
color = heatmap_palette(100, direction = heatmap_palette_direction),
breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)Mean relative difference matrix:
as.data.frame(mean_rel_diff)Mean relative difference heatmap:
pheatmap(
mean_rel_diff,
color = heatmap_palette(100, direction = -1),
breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
cluster_rows = FALSE, cluster_cols = FALSE
)TPM scatter plots:
scatterplot_df <- prepare_scatterplot_data(sicelore_bulk_tpm,
sicelore_pseudobulk_tpm)
scatterplot_df$tool <- "Sicelore"
scatterplot_df_list[["Sicelore"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)create_scatterplot_combined(scatterplot_df)Prepare summary barplot data:
plot_df <- do.call(rbind, scatterplot_df_list)
rownames(plot_df) <- NULL
plot_df$tool <- factor(plot_df$tool, levels = c("Isosceles", "IsoQuant",
"FLAMES", "Sicelore"))
plot_df <- plot_df %>%
group_by(tool, sample_bulk, sample_pseudobulk) %>%
summarise(
status = unique(status),
corr_spearman = cor(tpm_bulk, tpm_pseudobulk,
method = "spearman"),
mean_rel_diff = get_mean_rel_diff(tpm_bulk, tpm_pseudobulk)
) %>%
ungroup()
plot_df <- plot_df %>%
group_by(tool, status) %>%
summarise(
corr_spearman_avg = mean(corr_spearman),
corr_spearman_se = sd(corr_spearman) / sqrt(n()),
mean_rel_diff_avg = mean(mean_rel_diff),
mean_rel_diff_se = sd(mean_rel_diff) / sqrt(n())
) %>%
ungroup()Summary barplot (Spearman correlation):
ggplot(plot_df, mapping = aes(x = tool,
y = corr_spearman_avg,
fill = status)) +
geom_col(position = position_dodge(), color = "black", width = 0.5) +
geom_errorbar(
mapping = aes(ymin = corr_spearman_avg - corr_spearman_se,
ymax = corr_spearman_avg + corr_spearman_se),
width = 0.1,
color = "black",
position = position_dodge(0.5)
) +
scale_fill_manual(values = c(`Correct vs Correct` = "grey",
`Correct vs Decoy` = "darkred")) +
coord_cartesian(ylim = c(0, 1)) +
labs(y = "Correlation",
fill = "") +
theme_classic() +
theme(legend.position = "top",
legend.key.size = unit(0.4, "cm"),
aspect.ratio = 1,
axis.title.x = element_blank(),
axis.text.x = element_text(size = 8))Summary barplot (mean relative difference):
ggplot(plot_df, mapping = aes(x = tool,
y = mean_rel_diff_avg,
fill = status)) +
geom_col(position = position_dodge(), color = "black", width = 0.5) +
geom_errorbar(
mapping = aes(ymin = mean_rel_diff_avg - mean_rel_diff_se,
ymax = mean_rel_diff_avg + mean_rel_diff_se),
width = 0.1,
color = "black",
position = position_dodge(0.5)
) +
scale_fill_manual(values = c(`Correct vs Correct` = "grey",
`Correct vs Decoy` = "darkred")) +
coord_cartesian(ylim = c(0, max_mean_rel_diff + 0.1)) +
labs(y = "Mean rel. diff.",
fill = "") +
theme_classic() +
theme(legend.position = "top",
legend.key.size = unit(0.4, "cm"),
aspect.ratio = 1,
axis.title.x = element_blank(),
axis.text.x = element_text(size = 8))Metric difference and CI table:
get_diff_value <- function(x) {
return(abs(x[1] - x[2]))
}
get_diff_sd <- function(x) {
return(sqrt(sum(x ** 2)))
}
get_ci_factor <- function(x) {
qnorm(1 - (1 - x)/2)
}
diff_df <- plot_df %>%
group_by(tool) %>%
summarise(
corr_spearman = get_diff_value(corr_spearman_avg),
corr_spearman_upper_ci_95 = get_diff_value(corr_spearman_avg) +
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.95)),
corr_spearman_lower_ci_95 = get_diff_value(corr_spearman_avg) -
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.95)),
corr_spearman_upper_ci_90 = get_diff_value(corr_spearman_avg) +
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.90)),
corr_spearman_lower_ci_90 = get_diff_value(corr_spearman_avg) -
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.90)),
corr_spearman_upper_ci_68 = get_diff_value(corr_spearman_avg) +
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.68)),
corr_spearman_lower_ci_68 = get_diff_value(corr_spearman_avg) -
(get_diff_sd(corr_spearman_se) * get_ci_factor(0.68)),
mean_rel_diff = get_diff_value(mean_rel_diff_avg),
mean_rel_diff_upper_ci_95 = get_diff_value(mean_rel_diff_avg) +
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.95)),
mean_rel_diff_lower_ci_95 = get_diff_value(mean_rel_diff_avg) -
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.95)),
mean_rel_diff_upper_ci_90 = get_diff_value(mean_rel_diff_avg) +
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.90)),
mean_rel_diff_lower_ci_90 = get_diff_value(mean_rel_diff_avg) -
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.90)),
mean_rel_diff_upper_ci_68 = get_diff_value(mean_rel_diff_avg) +
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.68)),
mean_rel_diff_lower_ci_68 = get_diff_value(mean_rel_diff_avg) -
(get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.68))
)
diff_df <- as.data.frame(diff_df)
rownames(diff_df) <- diff_df$tool
diff_df$tool <- NULL
diff_df <- as.data.frame(t(diff_df))
diff_dfsessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=C
## [4] LC_COLLATE=C LC_MONETARY=C LC_MESSAGES=C
## [7] LC_PAPER=C LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] viridis_0.6.2 viridisLite_0.4.0
## [3] pheatmap_1.0.12 SingleCellExperiment_1.18.0
## [5] SummarizedExperiment_1.26.1 Biobase_2.56.0
## [7] GenomicRanges_1.48.0 GenomeInfoDb_1.32.2
## [9] IRanges_2.30.0 S4Vectors_0.34.0
## [11] BiocGenerics_0.42.0 MatrixGenerics_1.8.1
## [13] matrixStats_0.62.0 Matrix_1.4-1
## [15] scales_1.2.0 glue_1.6.2
## [17] forcats_0.5.1 stringr_1.4.0
## [19] dplyr_1.0.9 purrr_0.3.4
## [21] readr_2.1.2 tidyr_1.2.0
## [23] tibble_3.1.7 ggplot2_3.3.6
## [25] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 fs_1.5.2 bit64_4.0.5
## [4] lubridate_1.8.0 RColorBrewer_1.1-3 httr_1.4.3
## [7] tools_4.2.0 backports_1.4.1 bslib_0.4.0
## [10] utf8_1.2.2 R6_2.5.1 DBI_1.1.3
## [13] colorspace_2.0-3 withr_2.5.0 tidyselect_1.1.2
## [16] gridExtra_2.3 bit_4.0.4 compiler_4.2.0
## [19] archive_1.1.5 cli_3.3.0 rvest_1.0.2
## [22] xml2_1.3.3 DelayedArray_0.22.0 labeling_0.4.2
## [25] sass_0.4.2 digest_0.6.29 rmarkdown_2.14
## [28] XVector_0.36.0 pkgconfig_2.0.3 htmltools_0.5.3
## [31] dbplyr_2.2.1 fastmap_1.1.0 highr_0.9
## [34] rlang_1.0.4 readxl_1.4.0 rstudioapi_0.13
## [37] jquerylib_0.1.4 generics_0.1.3 farver_2.1.1
## [40] jsonlite_1.8.0 vroom_1.5.7 googlesheets4_1.0.0
## [43] RCurl_1.98-1.8 magrittr_2.0.3 GenomeInfoDbData_1.2.8
## [46] munsell_0.5.0 fansi_1.0.3 lifecycle_1.0.1
## [49] stringi_1.7.8 yaml_2.3.5 zlibbioc_1.42.0
## [52] grid_4.2.0 parallel_4.2.0 crayon_1.5.1
## [55] lattice_0.20-45 haven_2.5.0 hms_1.1.1
## [58] knitr_1.39 pillar_1.7.0 reprex_2.0.1
## [61] evaluate_0.15 modelr_0.1.8 vctrs_0.4.1
## [64] tzdb_0.3.0 cellranger_1.1.0 gtable_0.3.0
## [67] assertthat_0.2.1 cachem_1.0.6 xfun_0.31
## [70] broom_1.0.0 googledrive_2.0.0 gargle_1.2.0
## [73] ellipsis_0.3.2